Here is an example plot of an ellipse at cladogenesis. The plotting function includes all necessary information about the cladogenetic scenario, and plots what happens in “real” space.
x <- 0
y <- 0
r <- 3
s <- .5
a <- 3
d <- 1
m <- 1
c <- .5
hind <- 2
hvals <- c(0,pi/4,pi/2,3*pi/4,pi,5*pi/4,3*pi/2,7*pi/4)
h <- hvals[hind]
z <- 5
alpha <- -3
scenario_plot <- plot_scenario_annotated(x,y,r,s,a,d,m,c,h,z,alpha)
ggsave(paste0(plot_directory,"example_scenario.png"),scenario_plot,dpi=600,width=7,height=5.5)
These are the direction lines (in PCA and map form) for modern-day ellipses.
# PCA
pca_data <- ellipse_data[c("r","s")]
pca <- princomp(pca_data)
prop_var <- c(pca$sdev[1]^2/(pca$sdev[1]^2+pca$sdev[2]^2),pca$sdev[2]^2/(pca$sdev[1]^2+pca$sdev[2]^2))
# PCA Plot
plot_pca <- ggplot(ellipse_data) +
lims(x=c(-30,30),y=c(-30,30)) +
labs(x="West ------- East", y="South ------- North") +
geom_segment(x=0,y=0,xend=ellipse_data$r,yend=ellipse_data$s,alpha=0.5,color=lines_color) +
geom_segment(x=0,y=0,xend=-1*ellipse_data$r,yend=-1*ellipse_data$s,alpha=0.5,color=lines_color) +
geom_segment(x=0,y=0,xend=pca$loadings[1,2]*prop_var[2]*20,yend=pca$loadings[2,2]*prop_var[2]*20,col=color_1,arrow=arrow(length=unit(0.25,"cm")),lwd=1.5) +
geom_segment(x=0,y=0,xend=-1*pca$loadings[1,2]*prop_var[2]*20,yend=-1*pca$loadings[2,2]*prop_var[2]*20,col=color_1,arrow=arrow(length=unit(0.25,"cm")),lwd=1.5) +
geom_segment(x=0,y=0,xend=pca$loadings[1,1]*prop_var[1]*20,yend=pca$loadings[2,1]*prop_var[1]*20,col=color_2,arrow=arrow(length=unit(0.25,"cm")),lwd=1.5) +
geom_segment(x=0,y=0,xend=-1*pca$loadings[1,1]*prop_var[1]*20,yend=-1*pca$loadings[2,1]*prop_var[1]*20,col=color_2,arrow=arrow(length=unit(0.25,"cm")),lwd=1.5) +
annotate("text",x=30,y=30,hjust=1,label=paste0("PC1: ",round(prop_var[1],3)*100,"%, PC2: ",round(prop_var[2],3)*100,"%"),color=lines_color) +
theme_bw() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
axis.title = element_text(color=lines_color),
panel.grid = element_blank(),
panel.border = element_rect(color=lines_color,linewidth=2),
plot.background = element_rect(fill=background_color,color=background_color),
panel.background = element_rect(fill=background_color),
axis.title.x = element_text(size=16),
axis.title.y = element_text(size=16),
aspect.ratio = 1)
ggsave(paste0(plot_directory,"elongation_pca.png"),plot_pca,dpi=600)
# Vector Plot
plot_with_vectors <- ggplot(spheno_shapes) +
geom_sf(fill=lines_color,color=NA,alpha=0.04) +
geom_segment(x=ellipse_data$x,y=ellipse_data$y,xend=ellipse_data$x+ellipse_data$r*.05,yend=ellipse_data$y+ellipse_data$s*.05,color=color_1,lwd=.75) +
geom_segment(x=ellipse_data$x,y=ellipse_data$y,xend=ellipse_data$x-ellipse_data$r*.05,yend=ellipse_data$y-ellipse_data$s*.05,color=color_1,lwd=.75) +
theme_void() +
theme(panel.border = element_rect(color=lines_color,linewidth=2),
panel.background = element_rect(fill=background_color))
ggsave(paste0(plot_directory,"australia_with_vectors.png"),plot_with_vectors,dpi=600)
# Combining
vectors_and_map <- plot_pca + plot_with_vectors +
plot_annotation(theme = theme(plot.background = element_rect(color=background_color,fill=background_color,linewidth=2)))
ggsave(paste0(plot_directory,"vectors_and_map.png"),vectors_and_map,dpi=600,width=7,height=3.5)
These are just toy results from a brief analysis that was nowhere near convergence. I put it here as an example. Direction line plots are averaging across the entire posterior sample for all nodes.
# PROCESSING DIRECTION LINES -- DO NOT RERUN WIHTOUT NEW ANALYSIS
# data_h_long <- unlist(data_h[-1],use.names=FALSE)
# start_ind <- ncol(data_h) + 2
# end_ind <- ncol(data_r)
# data_r_long <- unlist(data_r[start_ind:end_ind],use.names=FALSE)
# data_s_long <- unlist(data_s[start_ind:end_ind],use.names=FALSE)
# direction_data <- data.frame(cbind(h=data_h_long, r=data_r_long, s=data_s_long))
#
# process_direction <- function(h,r,s,z,hval_extended){
# theta <- hval_extended[h+1]
# fx <- cos(theta)
# fy <- sin(theta)
# fz <- r * fx + s * fy
# proj_x <- x + fx + r * fz / z^2
# proj_y <- y + fy + s * fz / z^2
# scale <- sqrt(proj_x^2 + proj_y^2)
# true_x <- proj_x/scale
# true_y <- proj_y/scale
# true_angle <- atan2(true_y,true_x)
# if (true_angle < 0) {true_angle <- 2 * pi + true_angle}
# true_selection <- which(abs(hval_extended - true_angle) == min(abs(hval_extended - true_angle))) - 1
# if (true_selection == (length(hval_extended)-1)) {true_selection <- 0}
# if (true_selection == 4) {true_selection <- 0}
# if (true_selection == 5) {true_selection <- 1}
# if (true_selection == 6) {true_selection <- 2}
# if (true_selection == 7) {true_selection <- 3}
# direction_info <- c(true_x,true_y,true_angle,true_selection)
# }
#
# hval_extended <- c(0,pi/4,pi/2,3*pi/4,pi,5*pi/4,3*pi/2,7*pi/4,2*pi)
# z <- 10
# for (i in 1:nrow(direction_data)) {
# selection <- direction_data$h[i]
# if (selection == 4) {selection <- 0}
# if (selection == 5) {selection <- 1}
# if (selection == 6) {selection <- 2}
# if (selection == 7) {selection <- 3}
# direction_data$selection[i] <- selection
# direction_info <- process_direction(direction_data$h[i],direction_data$r[i],direction_data$s[i],z,hval_extended)
# direction_data$true_x[i] <- direction_info[1]
# direction_data$true_y[i] <- direction_info[2]
# direction_data$true_angle[i] <- direction_info[3]
# direction_data$true_selection[i] <- direction_info[4]
# }
#
# write.csv(direction_data,paste0(data_directory,"processed_directions.csv"),row.names=FALSE,quote=FALSE)
This plot uses raw direction lines. Since daughters may move in either direction along the direction line, we combine direction lines that are symmetric about the origin (e.g. 0 and pi).
# Getting direction line counts
count_0 <- sum(direction_data$selection == 0)
count_1 <- sum(direction_data$selection == 1)
count_2 <- sum(direction_data$selection == 2)
count_3 <- sum(direction_data$selection == 3)
# Performing Chi Square test
p <- chisq.test(c(count_0,count_1,count_2,count_3))$p.value
if (p < 0.001) {
p_string <- paste0("chi-squared test, p < 0.001")
} else {
p_string <- paste0("chi-squared test, p = ",signif(p,digits=3))
}
# Getting densities
lines <- c("0","1","2","3")
counts <- c(count_0,count_1,count_2,count_3)
density <- counts / sum(counts)
count_data <- data.frame(cbind(lines=lines,density=as.numeric(density)))
count_data$density <- as.numeric(count_data$density)
# Plotting bars
bar_plot <- ggplot(data=count_data,aes(x=lines,y=density,fill=lines)) +
geom_col() +
labs(y="Proportion") +
scale_fill_manual(values=c(color_1,color_2,color_3,color_4)) +
labs(title=p_string,color=lines_color) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(color=lines_color,size=16),
axis.ticks.y = element_line(color=lines_color),
axis.title = element_text(color=lines_color),
panel.grid = element_blank(),
panel.border = element_rect(color=lines_color,linewidth=2),
plot.background = element_rect(fill=background_color,color=background_color),
panel.background = element_rect(fill=background_color),
axis.title.x = element_blank(),
axis.title.y = element_text(size=16),
legend.position = "none",
plot.title = element_text(hjust = 1,color=lines_color))
# Direction lines for plotting
line_data <- data.frame(cbind(x=c(0.1,1.2,2.5,3.2),y=c(0,-0.3,-0.3,0.3),xend=c(0.9,1.8,2.5,3.8),yend=c(0,0.3,0.3,-0.3)))
direction_plot <- ggplot(line_data) +
geom_segment(x=line_data$x,xend=line_data$xend,y=line_data$y,yend=line_data$yend,arrow=arrow(ends="both",type="closed",length=unit(0.25,"cm")),linewidth=2,color="black") +
lims(x=c(0,4),y=c(-0.5,0.5)) +
coord_fixed() +
theme_void()
# Combining direction lines with bars
plot_with_arrows <- bar_plot + inset_element(direction_plot,-0.025,-0.05,1.025,0.45)
ggsave(paste0(plot_directory,"direction_probs.png"),plot_with_arrows,dpi=600)
For this plot, rather than using raw direction lines, we use projected direction lines, binning them according to which absolute direction line is closest. Note that 2*pi = 0.
# Getting true direction line counts
count_0 <- sum(direction_data$true_selection == 0)
count_1 <- sum(direction_data$true_selection == 1)
count_2 <- sum(direction_data$true_selection == 2)
count_3 <- sum(direction_data$true_selection == 3)
# Performing Chi Square test
p <- chisq.test(c(count_0,count_1,count_2,count_3))$p.value
if (p < 0.001) {
p_string <- paste0("chi-squared test, p < 0.001")
} else {
p_string <- paste0("chi-squared test, p = ",signif(p,digits=3))
}
# Getting densities
lines <- c("0","1","2","3")
counts <- c(count_0,count_1,count_2,count_3)
density <- counts / sum(counts)
count_data <- data.frame(cbind(lines=lines,density=as.numeric(density)))
count_data$density <- as.numeric(count_data$density)
# Plotting bars
bar_plot <- ggplot(data=count_data,aes(x=lines,y=density,fill=lines)) +
geom_col() +
labs(y="Proportion") +
scale_fill_manual(values=c(color_1,color_2,color_3,color_4)) +
labs(title=p_string,color=lines_color) +
theme_bw() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.text.y = element_text(color=lines_color,size=16),
axis.ticks.y = element_line(color=lines_color),
axis.title = element_text(color=lines_color),
panel.grid = element_blank(),
panel.border = element_rect(color=lines_color,linewidth=2),
plot.background = element_rect(fill=background_color,color=background_color),
panel.background = element_rect(fill=background_color),
axis.title.x = element_blank(),
axis.title.y = element_text(size=16),
legend.position = "none",
plot.title = element_text(hjust = 1,color=lines_color))
# Direction lines for plotting
line_data <- data.frame(cbind(x=c(0.1,1.2,2.5,3.2),y=c(0,-0.3,-0.3,0.3),xend=c(0.9,1.8,2.5,3.8),yend=c(0,0.3,0.3,-0.3)))
direction_plot <- ggplot(line_data) +
geom_segment(x=line_data$x,xend=line_data$xend,y=line_data$y,yend=line_data$yend,arrow=arrow(ends="both",type="closed",length=unit(0.25,"cm")),linewidth=2,color="black") +
lims(x=c(0,4),y=c(-0.5,0.5)) +
coord_fixed() +
theme_void()
# Combining direction lines with bars
plot_with_arrows <- bar_plot + inset_element(direction_plot,-0.025,-0.05,1.025,0.45)
ggsave(paste0(plot_directory,"true_direction_probs.png"),plot_with_arrows,dpi=600)
For this plot, we take the projected direction lines (un-binned) and plot them on a circle. This plot has not been made symmetric (but maybe should)? It doesn’t show much. I want to make a new one with highest posterior estimates instead of full posterior distributions at nodes.
circle <- make_ellipse_coords(0,0,0,0,log(pi),10)
direction_plot <- ggplot(direction_data,aes(x=true_x,y=true_y)) +
geom_polygon(data=circle,aes(x=x,y=y),color="black",fill=NA) +
geom_point(pch=21,stroke=NA,fill="blue",size=4,alpha=0.002) +
coord_fixed() +
theme_void()
ggsave(paste0(plot_directory,"direction_circle.png"),direction_plot,dpi=600)
These are the results of the completed simulations for the coverage experiment – more simulations to come!
# SIMULATION COVERAGE -- DO NOT RERUN WITHOUT NEW SIMULATIONS
# burnin <- 10000
# completed_iterations <- 200000
# params_list <- c("sigma_x","sigma_y","sigma_r","sigma_s","sigma_a","mu","kappa","root_x","root_y","root_r","root_s","root_a")
# simulation_numbers <- sprintf("%03.0f", 1:200)
# nsims <- length(simulation_numbers)
# sim_data <- data.frame(matrix(ncol=9,nrow=0))
# row <- 1
# for (num in simulation_numbers) {
# cat("\r","Simulation ", num, " | ", round(as.numeric(num)/nsims*100,0), "%")
# directory <- paste0(sim_prefix,num,"/")
# iterations <- as.numeric(strsplit(system(paste0("tail -n 1 ",directory,"model_log.tsv"),intern=TRUE),"\t")[[1]][1])
# if (is.na(iterations)) {iterations <- as.numeric(strsplit(system(paste0("tail -n 2 ",directory,"model_log.tsv"),intern=TRUE),"\t")[[1]][1])}
# if (iterations >= completed_iterations) {
# treefile <- ape::read.tree(paste0(directory,"true.tree.txt"))
# tree_size <- length(treefile$tip.label)
# logfile <- read.csv(paste0(directory,"model_log.tsv"),sep="\t",skipNul=TRUE)
# truefile <- strsplit(readLines(paste0(directory,"true.param.tsv")),"\t")
# true_vals <- c()
# for (i in 1:length(truefile)) {
# true_vals$param[i] <- truefile[[i]][1]
# true_vals$vals[i] <- paste(truefile[[i]][-1],collapse=",")
# }
# mcmc <- logfile[-c(1:burnin),]
# for (param in params_list) {
# param_true <- as.numeric(true_vals$vals[which(true_vals$param==param)])
# param_mcmc <- mcmc[[param]]
# param_est <- mean(param_mcmc)
# param_hpd <- hdi(param_mcmc)
# param_hpd_low <- param_hpd[[1]]
# param_hpd_high <- param_hpd[[2]]
# if (param_hpd_low <= param_true & param_true <= param_hpd_high) {param_covered <- TRUE} else {param_covered <- FALSE}
# param_ess <- ess(param_mcmc)
# param_row <- c(num,param,param_true,param_est,param_hpd_low,param_hpd_high,param_covered,param_ess,tree_size)
# sim_data <- rbind(sim_data,param_row)
# row <- row + 1
# }
# }
# }
# colnames(sim_data) <- c("sim","param","true","est","hpd_low","hpd_high","covered","ess","tree_size")
# write.table(sim_data,file=paste0(cluster_directory,"/simulations/output/sim_results.tsv"),quote=FALSE,sep="\t",row.names=FALSE)
# COVERAGES -- do not rerun without new simulations
# sim_data <- read.csv(paste0(cluster_directory,"/simulations/output/sim_results.tsv"),sep="\t")
# sim_data <- sim_data[which(sim_data$ess >= 200),]
#
# plot_param <- function(param, name) {
# data_subset <- sim_data[which(sim_data$param == param),]
# coverage <- round(sum(data_subset$covered==TRUE)/length(data_subset$covered) * 100)
# min <- min(min(data_subset$hpd_low),min(data_subset$true),min(data_subset$est))
# max <- max(max(data_subset$hpd_high),max(data_subset$true),max(data_subset$est))
# plot <- ggplot(data_subset, aes(x=true, y=est)) +
# geom_errorbar(aes(ymin=hpd_low,ymax=hpd_high,col=covered),linewidth=.25,alpha=.5) +
# geom_point(size=.5,pch=16,alpha=.5) +
# geom_abline(slope=1,linewidth=.25) +
# labs(x=NULL,y=NULL) +
# annotate(geom="text",label=name,x=min,y=max,hjust="inward",vjust="inward") +
# annotate(geom="text",label=paste(coverage,"%",sep=""),x=max,y=min,hjust="inward",vjust="inward") +
# lims(x=c(min,max),y=c(min,max)) +
# theme_bw() +
# theme(aspect.ratio=1,panel.grid.minor=element_blank(),panel.grid.major=element_blank(),legend.position="none",plot.margin=margin(4,4,4,4,"pt"))
# return(plot)
# }
#
# plot_sigma_x <- plot_param("sigma_x",bquote(sigma[x]))
# plot_sigma_y <- plot_param("sigma_y",bquote(sigma[y]))
# plot_sigma_r <- plot_param("sigma_r",bquote(sigma[r]))
# plot_sigma_s <- plot_param("sigma_s",bquote(sigma[s]))
# plot_sigma_a <- plot_param("sigma_a",bquote(sigma[a]))
# plot_mu <- plot_param("mu",bquote(mu))
# plot_kappa <- plot_param("kappa",bquote(kappa))
# plot_root_x <- plot_param("root_x",bquote(root[x]))
# plot_root_y <- plot_param("root_y",bquote(root[y]))
# plot_root_r <- plot_param("root_r",bquote(root[r]))
# plot_root_s <- plot_param("root_s",bquote(root[s]))
# plot_root_a <- plot_param("root_a",bquote(root[a]))
#
# coverage_plot <- plot_sigma_x + plot_root_x + plot_sigma_y + plot_root_y + plot_sigma_r + plot_root_r + plot_sigma_s + plot_root_s + plot_sigma_a + plot_root_a + plot_mu + plot_kappa + plot_layout(ncol = 4)
# ggsave(paste0(plot_directory,"coverage.png"),coverage_plot,dpi=600,width=7,height=5)
These simulations represent a scenario where 50% of species either go extinct before the present, or are not sampled for other reasons. As expected, rates of evolution of \(x\) and \(y\) are overestimated, because we fail to observe some cladogenetic events where \(x\) and \(y\) would change. Additionally, the rate of evolution of \(a\) is overestimated. This is likely for the same reason, although the situation is a bit more complex. Generally, cladogenetic scenarios make \(a\) smaller, which is counteracted by \(\mu\) and \(\kappa\) pulling \(a\) toward larger values. If we don’t observe some cladogenetic events, we might assume that \(a\) stays small, and has more time to evolve toward the optimum (which would cause an underestimation of \(\kappa\)). However, we do not see this; the deterministic part of the OU process seems robust to missing cladogenetic events. Instead, we observe an overestimate of \(\sigma_a\). It is also interesting that we overestimate rates of evolution for \(r\) and \(s\), parameters which do not change at cladogenesis. The model is likely attempting to compensate for shifts in \(x\) and \(y\) by expanding the influence of the cladogenetic events which are observed, necessitating more elongated ellipses at those times. Root values are not recovered especially well with or without extinction, although extinction does lead to a few extreme outliers which are not seen when the model is correctly specified (for example, inferring root values for \(x\) and \(y\) on the order of 10,000). It is not clear why this happens.
# SIMULATION SENSITIVITY -- DO NOT RERUN WITHOUT NEW SIMULATIONS
# burnin <- 10000
# completed_iterations <- 200000
# params_list <- c("sigma_x","sigma_y","sigma_r","sigma_s","sigma_a","mu","kappa","root_x","root_y","root_r","root_s","root_a")
# simulation_numbers <- sprintf("%03.0f", 1:100)
# nsims <- length(simulation_numbers)
# sim_data <- data.frame(matrix(ncol=7,nrow=0))
# row <- 1
# for (num in simulation_numbers) {
# cat("\r","Simulation ", num, " | ", round(as.numeric(num)/nsims*100,0), "%")
# directory <- paste0(sim_prefix,num,"/")
# if (file.exists(paste0(directory,"sensitivity/model_log.tsv"))) {
# iterations <- as.numeric(strsplit(system(paste0("tail -n 1 ",directory,"sensitivity/model_log.tsv"),intern=TRUE),"\t")[[1]][1])
# if (is.na(iterations)) {iterations <- as.numeric(strsplit(system(paste0("tail -n 2 ",directory,"sensitivity/model_log.tsv"),intern=TRUE),"\t")[[1]][1])}
# } else {iterations <- 0}
# if (iterations >= completed_iterations) {
# logfile <- read.csv(paste0(directory,"sensitivity/model_log.tsv"),sep="\t",skipNul=TRUE)
# truefile <- strsplit(readLines(paste0(directory,"true.param.tsv")),"\t")
# true_vals <- c()
# for (i in 1:length(truefile)) {
# true_vals$param[i] <- truefile[[i]][1]
# true_vals$vals[i] <- paste(truefile[[i]][-1],collapse=",")
# }
# mcmc <- logfile[-c(1:burnin),]
# for (param in params_list) {
# param_true <- as.numeric(true_vals$vals[which(true_vals$param==param)])
# param_mcmc <- mcmc[[param]]
# param_est <- mean(param_mcmc)
# param_hpd <- hdi(param_mcmc)
# param_hpd_low <- param_hpd[[1]]
# param_hpd_high <- param_hpd[[2]]
# param_ess <- ess(param_mcmc)
# param_row <- c(num,param,param_true,param_est,param_hpd_low,param_hpd_high,param_ess)
# sim_data <- rbind(sim_data,param_row)
# row <- row + 1
# }
# }
# }
# colnames(sim_data) <- c("sim","param","true","est","hpd_low","hpd_high","ess")
# write.table(sim_data,file=paste0(cluster_directory,"/simulations/output/sensitivity_results.tsv"),quote=FALSE,sep="\t",row.names=FALSE)
# SENSITIVITY -- do not rerun without new simulations
# sim_data <- read.csv(paste0(cluster_directory,"/simulations/output/sensitivity_results.tsv"),sep="\t")
# sim_data <- sim_data[which(sim_data$ess >= 200),]
#
# plot_param <- function(param, name) {
# data_subset <- sim_data[which(sim_data$param == param),]
# high_vals <- sort(data_subset$hpd_high)
# low_vals <- sort(data_subset$hpd_low)
# range <- c(low_vals[3],high_vals[length(high_vals)-2])
# high_limit <- high_vals[length(high_vals)-2] + .5 * (range[2] - range[1])
# low_limit <- low_vals[3] - .5 * (range[2] - range[1])
# keep <- c()
# for (i in 1:nrow(data_subset)) {
# keep[i] <- TRUE
# if (data_subset$hpd_high[i] > high_limit) {
# keep[i] <- FALSE
# }
# if (data_subset$hpd_low[i] < low_limit) {
# keep[i] <- FALSE
# }
# }
# n_outliers <- sum(keep==FALSE)
# data_subset <- data_subset[keep==TRUE,]
# min <- min(min(data_subset$hpd_low),min(data_subset$true),min(data_subset$est))
# max <- max(max(data_subset$hpd_high),max(data_subset$true),max(data_subset$est))
# plot <- ggplot(data_subset, aes(x=true, y=est)) +
# geom_errorbar(aes(ymin=hpd_low,ymax=hpd_high),linewidth=.25,alpha=.5) +
# geom_point(size=.5,pch=16,alpha=.5) +
# geom_abline(slope=1,linewidth=.25) +
# labs(x=NULL,y=NULL) +
# annotate(geom="text",label=name,x=min,y=max,hjust="inward",vjust="inward") +
# annotate(geom="text",label=paste("Outliers: ",n_outliers),x=max,y=min,hjust="inward",vjust="inward") +
# lims(x=c(min,max),y=c(min,max)) +
# theme_bw() +
# theme(aspect.ratio=1,panel.grid.minor=element_blank(),panel.grid.major=element_blank(),legend.position="none",plot.margin=margin(4,4,4,4,"pt"))
# return(plot)
# }
#
# plot_sigma_x <- plot_param("sigma_x",bquote(sigma[x]))
# plot_sigma_y <- plot_param("sigma_y",bquote(sigma[y]))
# plot_sigma_r <- plot_param("sigma_r",bquote(sigma[r]))
# plot_sigma_s <- plot_param("sigma_s",bquote(sigma[s]))
# plot_sigma_a <- plot_param("sigma_a",bquote(sigma[a]))
# plot_mu <- plot_param("mu",bquote(mu))
# plot_kappa <- plot_param("kappa",bquote(kappa))
# plot_root_x <- plot_param("root_x",bquote(root[x]))
# plot_root_y <- plot_param("root_y",bquote(root[y]))
# plot_root_r <- plot_param("root_r",bquote(root[r]))
# plot_root_s <- plot_param("root_s",bquote(root[s]))
# plot_root_a <- plot_param("root_a",bquote(root[a]))
#
# sensitivity_plot <- plot_sigma_x + plot_root_x + plot_sigma_y + plot_root_y + plot_sigma_r + plot_root_r + plot_sigma_s + plot_root_s + plot_sigma_a + plot_root_a + plot_mu + plot_kappa + plot_layout(ncol = 4)
# ggsave(paste0(plot_directory,"sensitivity.png"),sensitivity_plot,dpi=600,width=7,height=5)
Many of these rase analyses did not converge. Some had no variability at all, and many had very low ESS. Also, some of the estimates were very poor. I need to examine these, and see if I can improve the rase analyses to provide a fair comparison. At the very least, I will perform rase on the remainder of the 200 simulations. So far, it looks like EMPIRE performs much better than RASE on data simulated under EMPIRE (to be expected). Interestingly, RASE not only overestimates rates (expected), but also misestimates root values substantially (less expected).
# SIMULATION RASE -- DO NOT RERUN WITHOUT NEW SIMULATIONS
# params_list <- c("sigma_x","sigma_y","root_x","root_y")
# sim_data <- read.csv(paste0(cluster_directory,"/simulations/output/sim_results.tsv"),sep="\t")
# sim_data <- sim_data[which(sim_data$ess >= 200),]
# sim_data <- sim_data[which(sim_data$param %in% params_list),]
# sim_data <- sim_data[,c(1:4)]
# sim_data <- cbind(sim_data,rep(NA,nrow(sim_data)),rep(NA,nrow(sim_data)))
# colnames(sim_data) <- c("sim","param","true","EMPIRE","rase","ess")
# burnin <- 100
# simulation_numbers <- sprintf("%03.0f", 1:100)
# nsims <- length(simulation_numbers)
# for (num in simulation_numbers) {
# cat("\r","Simulation ", num, " | ", round(as.numeric(num)/nsims*100,0), "%")
# directory <- paste0(sim_prefix,num,"/")
# logfile <- read.csv(paste0(directory,"rase.csv"),sep=",",skipNul=TRUE)
# root_node <- (ncol(logfile))/2 + 1
# root_x_colname <- paste0("n",root_node,"_x")
# root_y_colname <- paste0("n",root_node,"_y")
# root_x_ind <- which(colnames(logfile)==root_x_colname)
# root_y_ind <- which(colnames(logfile)==root_y_colname)
# sigma2x_ind <- which(colnames(logfile)=="sigma2x")
# sigma2y_ind <- which(colnames(logfile)=="sigma2y")
# mcmc <- logfile[-c(1:burnin),c(root_x_ind,root_y_ind,sigma2x_ind,sigma2y_ind)]
# colnames(mcmc) <- c("root_x","root_y","sigma2_x","sigma2_y")
# mcmc$sigma_x <- lapply(mcmc$sigma2_x,"sqrt")
# mcmc$sigma_y <- lapply(mcmc$sigma2_y,"sqrt")
# for (param in params_list) {
# row <- which(sim_data$sim==as.numeric(num) & sim_data$param==param)
# if (length(row)==1) {
# param_mcmc <- unlist(mcmc[[param]])
# param_est <- mean(param_mcmc)
# param_ess <- ess(param_mcmc)
# sim_data$rase[row] <- param_est
# sim_data$ess[row] <- param_ess
# }
# }
# }
# sim_data <- sim_data[which(!is.na(sim_data$rase)),]
# write.table(sim_data,file=paste0(cluster_directory,"/simulations/output/rase_results.tsv"),quote=FALSE,sep="\t",row.names=FALSE)
# RASE -- do not rerun without new simulations
# sim_data <- read.csv(paste0(cluster_directory,"/simulations/output/rase_results.tsv"),sep="\t")
# sim_data <- sim_data[which(sim_data$ess >= 25),]
#
# plot_param <- function(param, name) {
# data_subset <- sim_data[which(sim_data$param == param),]
# model_data <- data.frame(cbind(true=c(data_subset$true,data_subset$true),est=c(data_subset$EMPIRE,data_subset$rase),model=c(rep("EMPIRE",nrow(data_subset)),rep("RASE",nrow(data_subset)))))
# model_data$true <- as.numeric(model_data$true)
# model_data$est <- as.numeric(model_data$est)
# quantiles <- quantile(model_data$est,c(.25,.75),names=FALSE)
# min <- quantiles[1] - (quantiles[2] - quantiles[1])
# max <- quantiles[2] + (quantiles[2] - quantiles[1])
# keep <- c()
# outliers <- c()
# for (i in 1:nrow(model_data)) {
# #keep[i] <- TRUE
# if (model_data$est[i] > max) {
# #keep[i] <- FALSE
# outliers <- c(outliers,model_data$model[i])
# }
# if (model_data$est[i] < min) {
# #keep[i] <- FALSE
# outliers <- c(outliers,model_data$model[i])
# }
# }
# n_rase_outliers <- sum(outliers=="RASE")
# n_empire_outliers <- sum(outliers=="EMPIRE")
# bottom_text <- paste0(
# "RASE Outliers: ",n_rase_outliers,"\n",
# "EMPIRE Outliers: ",n_empire_outliers,"\n"
# )
# #model_data <- model_data[keep==TRUE,]
# model_data$model <- as.factor(model_data$model)
# plot <- ggplot(model_data, aes(x=true, y=est)) +
# geom_point(pch=16,aes(color=model)) +
# guides(color=guide_legend(override.aes=list(size=5))) +
# geom_abline(slope=1,linewidth=.25) +
# labs(x=NULL,y=NULL,color="Model:") +
# annotate(geom="text",label=name,x=min,y=max,hjust="inward",vjust="inward") +
# annotate(geom="text",label=bottom_text,x=max,y=min,hjust="inward",vjust="inward") +
# lims(x=c(min,max),y=c(min,max)) +
# theme_bw() +
# theme(aspect.ratio=1,panel.grid.minor=element_blank(),panel.grid.major=element_blank(),plot.margin=margin(4,4,4,4,"pt"))
# return(plot)
# }
#
# plot_sigma_x <- plot_param("sigma_x",bquote(sigma[x]))
# plot_sigma_y <- plot_param("sigma_y",bquote(sigma[y]))
# plot_root_x <- plot_param("root_x",bquote(root[x]))
# plot_root_y <- plot_param("root_y",bquote(root[y]))
#
# rase_plot <- plot_sigma_x + plot_root_x + plot_sigma_y + plot_root_y + plot_layout(ncol = 2, guides="collect") & theme(legend.position="bottom")
# ggsave(paste0(plot_directory,"rase.png"),rase_plot,dpi=600,width=7,height=7.5)
This plot uses the same simulations as the coverage plot, but instead shows the quality of estimates based on tree sizes. Most trees (all between 20 and 250 taxa) behave similarly. With smaller trees, it is somewhat easier to estimate the root, and somewhat more difficult to estimate rate parameters. This is what we would expect.
# TREE SIZES -- do not rerun without new simulations
# sim_data <- read.csv(paste0(cluster_directory,"/simulations/output/sim_results.tsv"),sep="\t")
# sim_data <- sim_data[which(sim_data$ess >= 200),]
#
# plot_param <- function(param, name) {
# data_subset <- sim_data[which(sim_data$param == param),]
# min <- min(min(data_subset$hpd_low),min(data_subset$true),min(data_subset$est))
# max <- max(max(data_subset$hpd_high),max(data_subset$true),max(data_subset$est))
# plot <- ggplot(data_subset, aes(x=true, y=est)) +
# geom_point(size=.5,pch=16,aes(color=tree_size)) +
# #scale_color_gradient(low="yellow",high="blue") +
# geom_abline(slope=1,linewidth=.25) +
# labs(x=NULL,y=NULL) +
# annotate(geom="text",label=name,x=min,y=max,hjust="inward",vjust="inward") +
# lims(x=c(min,max),y=c(min,max)) +
# theme_bw() +
# theme(aspect.ratio=1,panel.grid.minor=element_blank(),panel.grid.major=element_blank(),plot.margin=margin(4,4,4,4,"pt"))
# return(plot)
# }
#
# plot_sigma_x <- plot_param("sigma_x",bquote(sigma[x]))
# plot_sigma_y <- plot_param("sigma_y",bquote(sigma[y]))
# plot_sigma_r <- plot_param("sigma_r",bquote(sigma[r]))
# plot_sigma_s <- plot_param("sigma_s",bquote(sigma[s]))
# plot_sigma_a <- plot_param("sigma_a",bquote(sigma[a]))
# plot_mu <- plot_param("mu",bquote(mu))
# plot_kappa <- plot_param("kappa",bquote(kappa))
# plot_root_x <- plot_param("root_x",bquote(root[x]))
# plot_root_y <- plot_param("root_y",bquote(root[y]))
# plot_root_r <- plot_param("root_r",bquote(root[r]))
# plot_root_s <- plot_param("root_s",bquote(root[s]))
# plot_root_a <- plot_param("root_a",bquote(root[a]))
#
# treesize_plot <- plot_sigma_x + plot_root_x + plot_sigma_y + plot_root_y + plot_sigma_r + plot_root_r + plot_sigma_s + plot_root_s + plot_sigma_a + plot_root_a + plot_mu + plot_kappa + plot_layout(ncol = 4) + plot_layout(guides="collect") & labs(color="Tree Size") & theme(legend.position="bottom") & scale_color_gradient(low="yellow",high="blue",aesthetics="color",limits=c(20,250))
# ggsave(paste0(plot_directory,"tree_size.png"),treesize_plot,dpi=600,width=7,height=5.5)